In [33]:
# load enrgenics-quantlet-import
# Brings the project helper functions (including load_eia_data, used below)
# into the global environment.
source("enrgenics-quantlet-import.r")
In [34]:
# Read the EIA monthly sales/revenue extract into a data frame.
# (Uses `<-` for assignment per R convention, instead of `=`.)
file <- "data/sales_revenue_1.csv"
df <- load_eia_data(file)

# Quick sanity check: inspect the first rows of the loaded table.
head(df)
Out[34]:
Cat | Year | Month | State | DataStatus | Date | Cat | Revenue | Sales | Customers | Price | ID
1.RESIDENTIAL19901AKFinal631195200RESIDENTIAL1747718175209.621
2.RESIDENTIAL19901ALFinal631195200RESIDENTIAL123332207009305.962
3.RESIDENTIAL19901ARFinal631195200RESIDENTIAL72506102632007.063
4.RESIDENTIAL19901AZFinal631195200RESIDENTIAL109332139649907.834
5.RESIDENTIAL19901CAFinal631195200RESIDENTIAL597161616800909.685
6.RESIDENTIAL19901COFinal631195200RESIDENTIAL71325104680506.816
In [59]:
# Keep only finalized records ("Final" status), excluding preliminary data.
totals <- subset(df, DataStatus == "Final")
# tmp = dcast(totals[which(totals$Cat == "TOTAL"),], Year + Month ~ State + Cat, value.var="Sales", fun.aggregate = sum, na.rm = TRUE)

# Wide table: one row per Year, one column per State, summing Sales for the
# TOTAL category. subset() replaces the redundant totals[which(...), ] form.
ts_year_by_state <- dcast(
  subset(totals, Cat == "TOTAL"),
  Year ~ State,
  value.var = "Sales",
  fun.aggregate = sum
)

# Pairwise correlation between the states' annual sales series.
# Column 1 (Year) is dropped; NOTE(review): the LAST column is dropped too,
# which discards one state/column from the correlation — presumably an
# unwanted trailing column, but verify against the dcast output.
corr_states <- cor(ts_year_by_state[, c(-1, -ncol(ts_year_by_state))])

# Mean annual sales per state (Year column excluded).
means_by_state <- colMeans(ts_year_by_state[, -1])
Out[59]:
  1. 2712554662
  2. 2762003050
  3. 2763365474
  4. 2861462336
  5. 2934562867
  6. 3013286575
  7. 3101127032
  8. 3145610430
  9. 3264230749
  10. 3312087083
  11. 3421414256
  12. 3394458107
  13. 3465466011
  14. 3493734498
  15. 3547479479
  16. 3660968521
  17. 3669918842
  18. 3764560731
  19. 3733964624
  20. 3596795150
  21. 3754841377
  22. 3749846184
  23. 3694649791
  24. 3724867839
  25. 3764700296
In [20]:
# Correlation heatmap across states. Rowv = NA / Colv = NA suppress the
# dendrogram reordering so rows and columns keep their original order;
# margins widen the label area so state names are not clipped.
heatmap(corr_states, Rowv=NA, Colv=NA, col = cm.colors(256), margins=c(5,10))
In [67]:
#http://www.r-bloggers.com/ggplot2-cheatsheet-for-scatterplots/
library(ggplot2)
# plot options for Jupyter Notebook
options(repr.plot.width = 12, repr.plot.height = 5)

# Plot Set 1 — revenue over time, nationally and for one state (DE).
# Final TOTAL-category records only.
totals <- subset(df, DataStatus == "Final" & Cat == "TOTAL")
# Date holds seconds since the Unix epoch; convert once to Date class.
# Subsetting afterwards preserves DateFormat, so the original's second,
# redundant conversion on totals_sub is removed.
totals$DateFormat <- as.Date(as.POSIXct(totals$Date, origin = "1970-01-01"))
totals_sub <- subset(totals, State == "DE")

p1 <- ggplot(totals_sub, aes(x = DateFormat, y = Revenue))
p1 + geom_point(aes(color = factor(Year)))
ggsave("output/plt1a.png", width = 16, height = 9, dpi = 120)
p1 + geom_point(aes(color = factor(Month)))
ggsave("output/plt1b.png", width = 16, height = 9, dpi = 120)
p1c <- ggplot(totals, aes(x = DateFormat, y = Revenue))
p1c + geom_point(aes(color = factor(State)))
ggsave("output/plt1c.png", width = 30, height = 20, dpi = 120)


# Plot Set 2 — US-wide annual total sales (sum over all state columns).
forp2 <- ts_year_by_state
# rowSums() replaces apply(x, 1, function(x) sum(x)) — same result,
# vectorized and idiomatic (column 1 is Year, so it is excluded).
forp2$total <- rowSums(ts_year_by_state[, 2:ncol(ts_year_by_state)])
p2 <- ggplot(forp2, aes(x = Year, y = total))
p2 + geom_point(aes(color = factor(Year)))
In [16]:
# consider weather data
# https://ram-n.github.io/weatherData/

# mapping without advanced libs
# http://stackoverflow.com/questions/24441775/how-do-you-create-a-us-states-heatmap-based-on-some-values

library(MASS)
library(maps)
library(maptools)
library(sp)

# NOTE(review): gpclibPermit() and the maptools package are deprecated
# (maptools was retired from CRAN); consider migrating this cell to `sf`.
permit <- gpclibPermit()

# Attach lowercase full state names so rows (keyed by two-letter
# abbreviations in rownames) can be matched to map('state') polygon names.
means_df <- data.frame(means_by_state)
dat <- means_df
dat$states <- tolower(state.name[match(rownames(dat), state.abb)])

mapUSA <- map('state', fill = TRUE, plot = FALSE)
# Polygon names look like "state:part" (e.g. islands); keep the state part.
# vapply replaces sapply for a type-stable result.
nms <- vapply(strsplit(mapUSA$names, ':'), function(x) x[1], character(1))
USApolygons <- map2SpatialPolygons(mapUSA, IDs = nms, CRS('+proj=longlat'))

# Align the per-state means with the polygon order; names with no match in
# dat$states (presumably e.g. "district of columbia", which is not in
# state.name — verify) yield NA and render as blank regions.
idx <- match(unique(nms), dat$states)
dat2 <- data.frame(value = dat$means_by_state[idx], state = unique(nms))
row.names(dat2) <- unique(nms)
USAsp <- SpatialPolygonsDataFrame(USApolygons, data = dat2)

spplot(USAsp['value'])
# https://www.census.gov/popest/data/datasets.html
# https://docs.google.com/spreadsheets/d/1JsV5bNnFoE-4xapsnrpqRP-5wgTkpDjgC0SXKW8zgPI/edit#gid=0
In [ ]: